Q1

First I load the data files

Next I reshape dat into a handy (long) data format. I use the tidyr package for this instead of the reshape package because the former is the improved version of the latter.

install.packages('tidyverse')
 URL 'https://cran.rstudio.com/bin/macosx/mavericks/contrib/3.3/tidyverse_1.1.1.tgz' を試しています 
Content type 'application/x-gzip' length 37228 bytes (36 KB)
==================================================
downloaded 36 KB

The downloaded binary packages are in
    /var/folders/1w/nyst5xl16t329h4gpz0j49hm0000gn/T//Rtmp2exVAC/downloaded_packages
library(tidyverse)
Loading tidyverse: ggplot2
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
Conflicts with tidy packages ----------------------------------------------------------------------------------------------------------------------------------
filter(): dplyr, stats
lag():    dplyr, stats
dat <- dat %>%
tidyr::gather(key=From, value=value, Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District\ of\ Columbia,Florida,Georgia,Hawaii,Idaho,Illinois,Indiana,Iowa,Kansas,Kentucky,Louisiana,Maine,Maryland,Massachusetts,Michigan,Minnesota,Mississippi,Missouri,Montana,Nebraska,Nevada,New\ Hampshire,New\ Jersey,New\ Mexico,New\ York,North\ Carolina,North\ Dakota,Ohio,Oklahoma,Oregon,Pennsylvania,Rhode\ Island,South\ Carolina,South\ Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West\ Virginia,Wisconsin,Wyoming)
 エラー:  想定外の入力です  in:
"dat <- dat %>%
tidyr::gather(key=From, value=value, Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,District\"

Delete the punctuation.

Then I add two new columns to the above data frame.

d1 <- merge(dat, states, by.x="From", by.y="States")
d1 <- d1[,c(-5,-6)]
names(d1)[4] <- "ID_From"
d2 <- merge(d1, states, by.x="To", by.y="States")
d2 <- d2[,c(-6,-7)]
names(d2)[5] <- "ID_To"
dat <- d2
dat

Sort the data frame in ascending order.

dat <- arrange(dat, ID_From)
dat <- arrange(dat, ID_To)
dat

Convert the above data frame into the matrix form

Drawing the chord diagram

chorddiag(as.matrix(t5),groupColors=states$Color,showTicks=F,groupnamePadding = 20,groupThickness=.05,groupnameFontsize=10)
row names of the 'data' matrix differ from its column names or the 'groupNames' argument.

Q2

(Note: the individual lines have not been added yet.) Read “Stops On Lines” and all the GIS data of the bus lines.

library(dplyr)
library(sp)
library(rgdal)
library(leaflet)
library(ggmap)

# Bus Stops
SOL <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/BusStops1216","StopsOnLines1216")
SOL.pj <- spTransform(SOL, CRS("+proj=longlat +datum=WGS84"))

# Bus Routes
CC <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/ComCir1216","ComCir1216")
CC.pj <- spTransform(CC, CRS("+proj=longlat +datum=WGS84"))
LE <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LimExp1216","LimExp1216")
LE.pj <- spTransform(LE, CRS("+proj=longlat +datum=WGS84"))
LCBD <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LocalCBD1216","LocalCBD1216")
LCBD.pj <- spTransform(LCBD, CRS("+proj=longlat +datum=WGS84"))
LNCBD <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LocalNonCBD1216","LocalNonCBD1216")
LNCBD.pj <- spTransform(LNCBD, CRS("+proj=longlat +datum=WGS84"))
RBRT <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/RapidBRT1216","RapidBRT1216")
RBRT.pj <- spTransform(RBRT, CRS("+proj=longlat +datum=WGS84"))

# とりあえずindividual2についてのみreadできることを確認。一旦放置してこれ以外でできるか試す。
I2 <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/Individuals1216","2")
I2.pj <- spTransform(I2, CRS("+proj=longlat +datum=WGS84"))

make Line_list

tmp_CC <- geometry(CC.pj)
tmp_LE <- geometry(LE.pj)
tmp_LCBD <- geometry(LCBD.pj)
tmp_LNCBD <- geometry(LNCBD.pj)
tmp_RBRT <- geometry(RBRT.pj)
tmps <- list(tmp_CC, tmp_LE, tmp_LCBD, tmp_LNCBD, tmp_RBRT)
Line_list <- list()
for (i in 1:5){
  for (j in 1:length(tmps[[i]])){
    Line_list <- c(Line_list, tmps[[i]][j]@lines[[1]]@Lines)
  }
}

make new_id

pjs <- list(CC.pj, LE.pj, LCBD.pj, LNCBD.pj, RBRT.pj)
LinLSs <- list()
for (i in 1:5){
 LinLSs <- c(LinLSs, sapply(pjs[[i]]@lines, function(x) length(x@Lines)))
}
LinLSs <- LinLSs %>% unlist()
new_id <- sapply(1:length(LinLSs), function(x) paste0(x, "_", seq.int(LinLSs[[x]]))) %>% 
  unlist()
SLDF <- mapply(function(x, y) Lines(x, ID = y), x = Line_list, y = new_id) %>%
  list() %>%
  SpatialLines() %>% 
  SpatialLinesDataFrame(data = DAT)
 SpatialLines(.) でエラー: 
  lines list not exclusively filled with Lines objects

make new lines and LA map

Q3

First I load the data.

library(quantmod)
library(highcharter)

x <- getSymbols("AUD/JPY", src = "oanda", auto.assign = FALSE)
y <- getSymbols("GBP/USD", src = "oanda", auto.assign = FALSE)

Next make Bollinger’s bands for each exchange rate.

x.BBands.ll <- BBands(x)$dn
x.BBands.ul <- BBands(x)$up
x.BBands.m <- BBands(x)$mavg
y.BBands.ll <- BBands(y)$dn
y.BBands.ul <- BBands(y)$up
y.BBands.m <- BBands(y)$mavg

The drawing code is as follows.

hc <- highchart(type="stock") %>% 
  hc_title(text="Charting Exchange Rates") %>% 
  hc_subtitle(text = "Data extracted using quantmod package") %>% 
  hc_yAxis_multiples(
    list(top = "0%", height = "50%", offset=0, opposite=TRUE),
    list(top = "50%", height = "50%", offset=0, opposite=TRUE)
  )%>%
  hc_add_series(x, id = "audjpy",name ="audjpy", yAxis=0, color="blue", lineWidth=1.5) %>%
  hc_add_series(x.BBands.ll, id = "audjpy.ll", name="audjpy Lower BBands",yAxis=0,
                color="black",dashStyle='shortdash', lineWidth=1) %>%
  hc_add_series(x.BBands.ul, id = "audjpy.ul", name="audjpy Upper BBands",yAxis=0,
                color="black",lineWidth=1) %>%
  hc_add_series(x.BBands.m, id = "audjpy.m",name="audjpy BBands MA", yAxis=0,
                color="red",lineWidth=1) %>%
  hc_add_series(y, id = "gbpusd",name="gbpusd",yAxis=1, color="green", lineWidth=1.5) %>%
  hc_add_series(y.BBands.ll, id = "gbpusd.ll",name="gbpusd Lower BBands", yAxis=1,
                color="black",dashStyle='shortdash',lineWidth=1) %>%
  hc_add_series(y.BBands.ul, id = "gbpusd.ul",name="gbpusd Upper BBands", yAxis=1,
                color="black",lineWidth=1) %>%
  hc_add_series(y.BBands.m, id = "gbpusd.m",name="gbpusd BBands MA", yAxis=1,
                color="red",lineWidth=1) %>%
  hc_add_theme(hc_theme_538())

hc

Q4

Load libraries and check the raw data. And make ffdf after converting character columns to factor columns in original df.

library(nycflights13)
library(ffbase)
library(ffbase2)
library(biglm)
library(pROC)
library(chron)

tmp <- flights
tmp$carrier <- as.factor(tmp$carrier)
tmp$tailnum <- as.factor(tmp$tailnum)
tmp$origin <- as.factor(tmp$origin)
tmp$dest <- as.factor(tmp$dest)

flightff <- as.ffdf(tmp)

Next I make new columns as follows

flightff$Delay <- ffifelse(flightff$dep_delay > 0, 1,0)
flightff$DepHour <- flightff$hour
flightff$Car <- ffifelse(flightff$carrier %in% as.factor(c("DL","US","DH","UA")), 1, 0)
flightff$Night <- ffifelse(flightff$hour > 18 | flightff$hour < 6, 1, 0)
flightff$Weekend <- ffifelse(day.of.week(month=flightff$month, day=flightff$day, year=flightff$year) == 6, 1, 0)

I exclude the rows whose Delay values are NA and rename it to logitff. And then I split the dataset into train set and test set.

logitff <- flightff[!is.na(flightff$Delay),]

indx <- ff(1:nrow(logitff))
p <- 0.7
trainIndx <- ff(indx[1:trunc(length(indx)*p)])
trainset <- logitff[trainIndx,]
testIndx <- ff(indx[(trunc(length(indx)*p)+1):length(indx)])
testset <- logitff[testIndx,]

Logistic regression

summary(fit)
Large data regression model: bigglm(Delay ~ DepHour + Car + Night + Weekend, data = trainset, 
    family = binomial(), sandwich = TRUE)
Sample size =  229964 
               Coef    (95%     CI)     SE p
(Intercept) -1.8632 -1.8959 -1.8305 0.0164 0
DepHour      0.1108  0.1084  0.1133 0.0012 0
Car         -0.1073 -0.1255 -0.0891 0.0091 0
Night       -0.2672 -0.2968 -0.2376 0.0148 0
Weekend     -0.1737 -0.2025 -0.1450 0.0144 0
Sandwich (model-robust) standard errors

predict and make confusionmatrix in train_set

train_confusion
     
           0      1    Sum
  0   122907  19209 142116
  1    68525  19323  87848
  Sum 191432  38532 229964

predict and make confusionmatrix in test_set

test_confusion
     
          0     1   Sum
  0   50824  7149 57973
  1   31212  9372 40584
  Sum 82036 16521 98557

Draw ROC curve

Q5

First I load the data. And before using spark I delete the irrelevant columns.

Remove the observations satisfying the condition

Split this data into trainset and testset.

test
$test
NA

Use Decision tree

decision_tree <- train %>%
  ml_decision_tree(response="BOROUGH", features = c("LATITUDE","LONGITUDE"), max.bins = 200L, max.depth = 10L, seed=123L) %>%

Prediction

table(pred$BOROUGH, pred$prediction)
               
                    0     1     2     3     4
  BRONX             0     0     0  8876     0
  BROOKLYN      21181     0    24     0     1
  MANHATTAN         0 17944     0     0     0
  QUEENS           25     1 17777     0     0
  STATEN ISLAND    23     0     0     0  3169
---
title: "Big Data Analytics Assignment 1"
output: html_notebook
---

## Q1

First I load the data files
```{r}
# Load the migration table (migration2012.csv) from a local CSV file.
# NOTE(review): absolute, machine-specific path — the notebook only runs
# where this directory exists.
dat <- read.csv("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/migration2012.csv")
# Preview the first rows.
head(dat)
```

```{r}
# Load the per-state lookup table (states_chord.csv). Later chunks read its
# `States` and `Color` columns.
# NOTE(review): absolute, machine-specific path.
states <- read.csv("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/states_chord.csv")
# Print the whole lookup table.
states
```

Next I reshape dat into a handy (long) data format. I use the **tidyr package** for this instead of the **reshape package** because the former is the improved version of the latter.

```{r}
# Install tidyverse only when it is missing, so that re-running the notebook
# does not download and reinstall the package every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)
```

```{r}
# Reshape from wide (one column per origin state) to long: the origin state
# name goes into `From` and the cell content into `value`.
dat <- tidyr::gather(
  dat,
  key = From, value = value,
  Alabama, Alaska, Arizona, Arkansas, California, Colorado, Connecticut,
  Delaware, District.of.Columbia, Florida, Georgia, Hawaii, Idaho, Illinois,
  Indiana, Iowa, Kansas, Kentucky, Louisiana, Maine, Maryland, Massachusetts,
  Michigan, Minnesota, Mississippi, Missouri, Montana, Nebraska, Nevada,
  New.Hampshire, New.Jersey, New.Mexico, New.York, North.Carolina,
  North.Dakota, Ohio, Oklahoma, Oregon, Pennsylvania, Rhode.Island,
  South.Carolina, South.Dakota, Tennessee, Texas, Utah, Vermont, Virginia,
  Washington, West.Virginia, Wisconsin, Wyoming
)
```

Delete the punctuation.

```{r}
# Replace every punctuation character with a space in every column, so that
# dotted names such as "New.Hampshire" become "New Hampshire".
# apply() first coerces the whole data frame to a character matrix, so all
# columns come back as text (or factors, depending on the R version).
# NOTE(review): this also rewrites the `value` column — a decimal point or
# minus sign there would be replaced by a space; confirm the values are
# plain non-negative integers.
dat <- data.frame(apply(dat, 2, function(y) gsub("[[:punct:]]", " ", y)))
dat
```

Then I add two new columns to the above data frame.

```{r}
# Join the `states` lookup onto every row, matched on the origin state name
# (dat$From == states$States).
d1 <- merge(dat, states, by.x="From", by.y="States")
# Drop columns 5 and 6 by position and rename column 4 to ID_From.
# NOTE(review): positional indexing assumes a fixed column layout of `dat`
# and `states` — confirm if either input file changes.
d1 <- d1[,c(-5,-6)]
names(d1)[4] <- "ID_From"

# Same lookup again, this time matched on the destination state name
# (d1$To == states$States); column 5 becomes ID_To.
d2 <- merge(d1, states, by.x="To", by.y="States")
d2 <- d2[,c(-6,-7)]
names(d2)[5] <- "ID_To"
# Overwrite `dat` with the enriched table and print it.
dat <- d2
dat
```

Sort the data frame in ascending order.

```{r}
# Sort by destination ID, breaking ties by origin ID. A single multi-key
# arrange() states the intended ordering explicitly instead of relying on the
# second of two successive sorts being stable.
dat <- arrange(dat, ID_To, ID_From)
dat
```

Convert the above data frame into the matrix form

```{r}
# t1: columns 1, 2, 4, 5 of `dat`, spread so that each distinct `To` value
# becomes a column holding ID_To.
# NOTE(review): the positional selections assume `dat` is laid out as
# To, From, value, ID_From, ID_To — confirm.
t1 <- dat[,c(1,2,4,5)]
t1 <- t1 %>%
  spread(key=To, value=ID_To)

# t2: columns 1-4 of `dat`, spread the same way but with `value` in the cells.
t2 <- dat[,c(1,2,3,4)]
t2 <- t2 %>%
  spread(key=To, value=value)

# Put the first row of t1 on top of t2 as a helper row, label it "ID_To" with
# ID_From = 0, then sort all rows by ID_From.
t3 <- rbind(t1[1,],t2)
t3$From <- as.character(t3$From)
t3[1,2] <- 0
t3[1,1] <- "ID_To"
t3 <- t3 %>%
  arrange(ID_From)

# Reorder the columns into an explicit, hand-written state order.
library(data.table)
setcolorder(t3,c("From","ID_From","Connecticut","Maine","Massachusetts","New Hampshire","Rhode Island",
"Vermont","New Jersey","New York","Pennsylvania","Illinois","Indiana",
"Michigan","Ohio","Wisconsin","Iowa","Kansas","Minnesota",
"Missouri","Nebraska","North Dakota","South Dakota","Delaware","Florida",
"Georgia","Maryland","North Carolina","South Carolina","Virginia","District of Columbia",
"West Virginia","Alabama","Kentucky","Mississippi","Tennessee","Arkansas",
"Louisiana","Oklahoma","Texas","Arizona","Colorado","Idaho",
"Montana","Nevada","New Mexico","Utah","Wyoming","Alaska",
"California","Hawaii","Oregon","Washington"))

# Drop the first row and the ID_From column.
# NOTE(review): presumably the first row is the "ID_To" helper row (ID_From = 0
# sorts first if all real IDs are positive) — confirm.
t3 <- t3[c(-1),c(-2)]

# t4: the cells only, with the From names as row names.
t4 <- t3[,-1]
rownames(t4) <- t3[,1]

# t5: the same cells converted to numeric.
# NOTE: data.frame() applies make.names() to the column names by default, so
# names containing spaces become dotted (e.g. "New.Hampshire"), while the row
# names assigned below keep their spaces.
t5 <- data.frame(apply(t4, 2, function(y) as.numeric(y)))
rownames(t5) <- t3[,1]
t5
```

Drawing the chord diagram
```{r}
library(chorddiag)

# chorddiag() stops with "row names of the 'data' matrix differ from its
# column names" unless both sets of names are identical. The column names of
# `t5` had their spaces replaced by dots when the data frame was built
# ("New.Hampshire"), whereas the row names still contain spaces, so restore
# the spaces before plotting. No state name contains a literal dot.
flow_matrix <- as.matrix(t5)
colnames(flow_matrix) <- gsub(".", " ", colnames(flow_matrix), fixed = TRUE)

chorddiag(
  flow_matrix,
  groupColors = states$Color,
  showTicks = FALSE, # spelled out: `F` is an ordinary, reassignable variable
  groupnamePadding = 20,
  groupThickness = .05,
  groupnameFontsize = 10
)
```


## Q2

(Note: the individual lines have not been added yet.)
Read "Stops On Lines" and all the GIS data of the bus lines.
```{r}
library(dplyr)
library(sp)
library(rgdal)
library(leaflet)
library(ggmap)

# Each layer is read with readOGR(<directory>, <layer name>) and then
# re-projected to longitude/latitude on the WGS84 datum.
# NOTE(review): all paths are absolute and machine-specific.

# Bus Stops
SOL <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/BusStops1216","StopsOnLines1216")
SOL.pj <- spTransform(SOL, CRS("+proj=longlat +datum=WGS84"))

# Bus Routes
CC <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/ComCir1216","ComCir1216")
CC.pj <- spTransform(CC, CRS("+proj=longlat +datum=WGS84"))
LE <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LimExp1216","LimExp1216")
LE.pj <- spTransform(LE, CRS("+proj=longlat +datum=WGS84"))
LCBD <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LocalCBD1216","LocalCBD1216")
LCBD.pj <- spTransform(LCBD, CRS("+proj=longlat +datum=WGS84"))
LNCBD <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/LocalNonCBD1216","LocalNonCBD1216")
LNCBD.pj <- spTransform(LNCBD, CRS("+proj=longlat +datum=WGS84"))
RBRT <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/RapidBRT1216","RapidBRT1216")
RBRT.pj <- spTransform(RBRT, CRS("+proj=longlat +datum=WGS84"))

# For now, only confirmed that individual layer "2" can be read. Leaving it
# aside for the moment and trying whether the task can be done without it.
I2 <- readOGR("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/Individuals1216","2")
I2.pj <- spTransform(I2, CRS("+proj=longlat +datum=WGS84"))
```

make Line_list
```{r}
# Geometry-only view of every projected bus-route layer.
tmp_CC <- geometry(CC.pj)
tmp_LE <- geometry(LE.pj)
tmp_LCBD <- geometry(LCBD.pj)
tmp_LNCBD <- geometry(LNCBD.pj)
tmp_RBRT <- geometry(RBRT.pj)
tmps <- list(tmp_CC, tmp_LE, tmp_LCBD, tmp_LNCBD, tmp_RBRT)

# Flatten layer -> feature -> Line part into one flat list of `Line` objects,
# in layer order and then feature order. Walking the slots directly avoids the
# hard-coded `1:5`, the `1:length()` trap on an empty layer, and re-copying
# the growing list on every append.
parts_by_feature <- unlist(
  lapply(tmps, function(layer) {
    lapply(layer@lines, function(feature) feature@Lines)
  }),
  recursive = FALSE
)
Line_list <- unlist(parts_by_feature, recursive = FALSE)
if (is.null(Line_list)) {
  # unlist() of an empty list is NULL; keep the original "empty list" result.
  Line_list <- list()
}
```


make new_id
```{r}
pjs <- list(CC.pj, LE.pj, LCBD.pj, LNCBD.pj, RBRT.pj)

# Number of `Line` parts in each feature, over all layers in order.
# vapply() pins the result type; iterating over `pjs` removes the hard-coded
# layer count.
LinLSs <- unlist(lapply(pjs, function(layer) {
  vapply(layer@lines, function(feature) length(feature@Lines), integer(1))
}))

# One ID per Line part: "<feature index>_<part index>", e.g. "3_1", "3_2".
# seq_len() yields no ID for a feature with zero parts, whereas seq.int(0)
# would produce the bogus pair 1, 0.
new_id <- unlist(lapply(seq_along(LinLSs), function(k) {
  paste0(k, "_", seq_len(LinLSs[[k]]))
}))
```

```{r}
## make a new data.frame (only route_id)
# `new_id` holds one ID per Line part, but each layer's attribute table holds
# one row per feature. Repeat every feature's route ID once per part so that
# the attribute table has exactly one row per ID; otherwise assigning the row
# names fails as soon as any feature has more than one part.
route_ids <- unlist(lapply(pjs, function(layer) {
  as.character(layer@data$VAR_IDENT)
}))
DAT <- data.frame(
  route_id = rep(route_ids, times = LinLSs),
  row.names = new_id,
  stringsAsFactors = FALSE
)

# Wrap every Line part in its own `Lines` object carrying the matching ID.
# Map() always returns a plain list, which is what SpatialLines() requires.
# The coordinates were already transformed to WGS84 longitude/latitude, so
# that CRS is declared on the rebuilt object.
SLDF <- Map(function(x, y) Lines(x, ID = y), Line_list, new_id) %>%
  unname() %>%
  SpatialLines(proj4string = CRS("+proj=longlat +datum=WGS84")) %>%
  SpatialLinesDataFrame(data = DAT)
```


make new lines and LA map
```{r}
# Geocode the map centre. The query previously read 'Los Angels' (typo).
dat <- geocode("Los Angeles")

# setView() expects plain numbers, so the coordinates are extracted with `$`
# rather than passed as one-column data frames (`dat['lon']`).
leaflet() %>%
  setView(lng = dat$lon, lat = dat$lat, zoom = 11) %>%
  # Bus routes as thin black lines.
  addPolylines(data = SLDF, color = "black", opacity = 1, weight = 1) %>%
  # Bus stops as red circles, positioned from the LONG / LAT attribute columns.
  addCircles(data = SOL.pj@data, ~LONG, ~LAT, color = "red", weight = 0.3) %>%
  addTiles()
```


## Q3

First I load the data.
```{r}
library(quantmod)
library(highcharter)

# Download one currency pair from OANDA and return it instead of assigning
# it into the workspace.
fetch_rate <- function(pair) {
  getSymbols(pair, src = "oanda", auto.assign = FALSE)
}

x <- fetch_rate("AUD/JPY")
y <- fetch_rate("GBP/USD")
```

Next make Bollinger's bands for each exchange rate.
```{r}
# Compute the Bollinger bands once per series and pick the columns from that
# result, instead of recomputing BBands() for every single column.
x_bands <- BBands(x)
y_bands <- BBands(y)

# ll = lower band, ul = upper band, m = moving average.
x.BBands.ll <- x_bands$dn
x.BBands.ul <- x_bands$up
x.BBands.m <- x_bands$mavg
y.BBands.ll <- y_bands$dn
y.BBands.ul <- y_bands$up
y.BBands.m <- y_bands$mavg
```

The drawing code is as follows.
```{r}
# Base chart: titles plus two stacked y-axes, each taking half the height.
hc <- highchart(type = "stock") %>%
  hc_title(text = "Charting Exchange Rates") %>%
  hc_subtitle(text = "Data extracted using quantmod package") %>%
  hc_yAxis_multiples(
    list(top = "0%", height = "50%", offset = 0, opposite = TRUE),
    list(top = "50%", height = "50%", offset = 0, opposite = TRUE)
  )

# Upper panel (yAxis = 0): AUD/JPY with its lower, upper and middle bands.
hc <- hc %>%
  hc_add_series(x, id = "audjpy", name = "audjpy", yAxis = 0,
                color = "blue", lineWidth = 1.5) %>%
  hc_add_series(x.BBands.ll, id = "audjpy.ll", name = "audjpy Lower BBands",
                yAxis = 0, color = "black", dashStyle = "shortdash",
                lineWidth = 1) %>%
  hc_add_series(x.BBands.ul, id = "audjpy.ul", name = "audjpy Upper BBands",
                yAxis = 0, color = "black", lineWidth = 1) %>%
  hc_add_series(x.BBands.m, id = "audjpy.m", name = "audjpy BBands MA",
                yAxis = 0, color = "red", lineWidth = 1)

# Lower panel (yAxis = 1): GBP/USD with its lower, upper and middle bands.
hc <- hc %>%
  hc_add_series(y, id = "gbpusd", name = "gbpusd", yAxis = 1,
                color = "green", lineWidth = 1.5) %>%
  hc_add_series(y.BBands.ll, id = "gbpusd.ll", name = "gbpusd Lower BBands",
                yAxis = 1, color = "black", dashStyle = "shortdash",
                lineWidth = 1) %>%
  hc_add_series(y.BBands.ul, id = "gbpusd.ul", name = "gbpusd Upper BBands",
                yAxis = 1, color = "black", lineWidth = 1) %>%
  hc_add_series(y.BBands.m, id = "gbpusd.m", name = "gbpusd BBands MA",
                yAxis = 1, color = "red", lineWidth = 1)

# Apply the theme last, then display the chart.
hc <- hc_add_theme(hc, hc_theme_538())

hc
```


## Q4

Load libraries and check the raw data. And make ffdf after converting character columns to factor columns in original df.
```{r}
library(nycflights13)
library(ffbase)
library(ffbase2)
library(biglm)
library(pROC)
library(chron)

# Work on a copy of `flights` and turn its character columns into factors
# before converting the table to an ffdf.
tmp <- flights
for (col in c("carrier", "tailnum", "origin", "dest")) {
  tmp[[col]] <- as.factor(tmp[[col]])
}

flightff <- as.ffdf(tmp)
```

Next I make new columns as follows
```{r}
# Delay: 1 when the departure delay is positive, else 0.
flightff$Delay <- ffifelse(flightff$dep_delay > 0, 1, 0)
# DepHour: copy of the `hour` column.
flightff$DepHour <- flightff$hour
# Car: 1 for carriers DL, US, DH and UA, else 0.
flightff$Car <- ffifelse(flightff$carrier %in% as.factor(c("DL","US","DH","UA")), 1, 0)
# Night: 1 when the hour is after 18 or before 6, else 0.
flightff$Night <- ffifelse(flightff$hour > 18 | flightff$hour < 6, 1, 0)
# Weekend: chron::day.of.week() returns 0 for Sunday through 6 for Saturday.
# Comparing with 6 alone flagged Saturdays only, so Sunday (0) is included too.
dow <- day.of.week(month = flightff$month, day = flightff$day, year = flightff$year)
flightff$Weekend <- ffifelse(dow == 0 | dow == 6, 1, 0)
```

I exclude the rows whose Delay values are NA and rename it to logitff.
And then I split the dataset into train set and test set.
```{r}
# Keep only the rows whose Delay is not NA.
logitff <- flightff[!is.na(flightff$Delay),]

# Split by row position: the first 70% of the rows form the training set and
# the remaining rows form the test set.
# NOTE(review): the split is sequential, not random — if the rows are in
# chronological order the two sets cover different periods; confirm this is
# intended.
indx <- ff(1:nrow(logitff))
p <- 0.7
trainIndx <- ff(indx[1:trunc(length(indx)*p)])
trainset <- logitff[trainIndx,]
testIndx <- ff(indx[(trunc(length(indx)*p)+1):length(indx)])
testset <- logitff[testIndx,]
```

Logistic regression 
```{r}
# Fit a logistic regression (binomial family) of Delay on DepHour, Car, Night
# and Weekend using the training set; sandwich=TRUE requests model-robust
# standard errors.
fit <- bigglm.ffdf(Delay~DepHour+Car+Night+Weekend, data = trainset, family=binomial(), sandwich=TRUE)
# Print the coefficient table.
summary(fit)
```

predict and make confusionmatrix in train_set
```{r}
# Predicted probabilities on the training set, cut at 0.5 into 1 / 0.
train_pred <- ifelse(
  predict(fit, newdata = trainset, type = "response") > 0.5,
  1, 0
)

# Cross-tabulate observed Delay (rows) against predicted class (columns) and
# append the row / column totals.
train_confusion <- addmargins(
  table(
    as.integer(as.data.frame(trainset)$Delay),
    as.integer(train_pred)
  )
)
train_confusion
```

predict and make confusionmatrix in test_set
```{r}
# Predicted probabilities on the test set, cut at 0.5 into 1 / 0.
test_pred <- ifelse(
  predict(fit, newdata = testset, type = "response") > 0.5,
  1, 0
)

# Cross-tabulate observed Delay (rows) against predicted class (columns) and
# append the row / column totals.
test_confusion <- addmargins(
  table(
    as.integer(as.data.frame(testset)$Delay),
    as.integer(test_pred)
  )
)
test_confusion
```

Draw ROC curve
```{r}
# Recompute the raw predicted probabilities for the test set; the ROC curve
# needs the scores, not the 0/1 classes.
test_pred <- predict(fit, newdata = testset, type = "response")

# ROC of the observed Delay against the predicted probability.
roc <- roc(
  as.integer(as.data.frame(testset)$Delay),
  as.numeric(test_pred)
)
plot(roc)
```


## Q5

First I load the data. And before using spark I delete the irrelevant columns.
```{r}
library(sparklyr)
library(dplyr)
library(readr)
# Connect to a local Spark instance.
sc <- spark_connect(master = "local")

# Is there a problem with the original CSV? -> "Unspecified" seems to be rejected -> maybe change "Unspecified" to 0. -> Now "PASSENGER VEHICKE" is rejected instead.
# In short, every String column seems to fail. Yet with the titanic data, Strings are passed without trouble. Why?
# The two examples show that whitespace is not the problem.
# NOTE(review): absolute, machine-specific path.
dat <- read_csv("/Users/susu/Desktop/Hong\ Kong/Semester2/Big_Data/assignment_data/as1/NYPD_Motor_Vehicle_Collisions.csv")
# Keep only the four columns used below, then copy the table into Spark as
# "nypd_tbl", replacing any existing table of that name.
dat <- dat[,c("BOROUGH","LATITUDE","LONGITUDE","UNIQUE KEY")]
nypd_tbl <- copy_to(sc, dat, "nypd_tbl",overwrite = TRUE)
```

Remove the observations satisfying the condition
```{r}
# Keep only rows with a non-empty borough and with latitude / longitude that
# are neither missing nor zero.
nypd_tbl <- filter(
  nypd_tbl,
  BOROUGH != "",
  !is.na(LATITUDE),
  !is.na(LONGITUDE),
  LATITUDE != 0,
  LONGITUDE != 0
)
nypd_tbl
```

Split this data into trainset and testset.
```{r}
# Split into 90% training / 10% test with a fixed seed, then pull the two
# partitions out by name.
partitions <- sdf_partition(nypd_tbl, training = 0.9, test = 0.1, seed = 123)
train <- partitions[["training"]]
test <- partitions[["test"]]
```

Use Decision tree
```{r}
# Fit a decision tree that predicts BOROUGH from LATITUDE and LONGITUDE.
# The statement previously ended in a dangling `%>%`, which left the
# expression incomplete and made the chunk fail to parse.
decision_tree <- train %>%
  ml_decision_tree(
    response = "BOROUGH",
    features = c("LATITUDE", "LONGITUDE"),
    max.bins = 200L,
    max.depth = 10L,
    seed = 123L
  )
```

Prediction
```{r}
# Score the test partition with the fitted tree and bring the result back
# from Spark into a local data frame.
pred <- collect(sdf_predict(decision_tree, test))

# Observed borough (rows) against the predicted class index (columns).
table(pred$BOROUGH, pred$prediction)
```

































